library(data.table)
library(rmarkdown)
library(AnnotationHub)
library(tidyverse)
library(tximport)
library(ggplot2)
library(DESeq2)
library(pheatmap)
library(gridExtra)
library(UpSetR)
library(apeglm)
library(ashr)
AnnotationSpecies <- "Homo sapiens" # Assign your species
ah <- AnnotationHub(hub=getAnnotationHubOption("URL")) # Bring annotation DB
# Filter annotation of interest
ahQuery <- query(ah, c("OrgDb", AnnotationSpecies))
if (length(ahQuery) == 1) {
DBName <- names(ahQuery)
} else if (length(ahQuery) > 1) {
DBName <- names(ahQuery)[1]
} else {
print("You don't have a valid DB")
rmarkdown::render()
}
AnnoDb <- ah[[DBName]] # Store into an OrgDb object
# Explore your OrgDb object with following accessors:
# columns(AnnpDb)
# keytypes(AnnoDb)
# keys(AnnoDb, keytype=..)
# select(AnnoDb, keys=.., columns=.., keytype=...)
AnnoKey <- keys(AnnoDb, keytype="ENSEMBLTRANS")
# Note: Annotation has to be done with not genome but transcripts
AnnoDb <- select(AnnoDb,
AnnoKey,
keytype="ENSEMBLTRANS",
columns="ENSEMBL")
# Check if your AnnoDb has been extracted and saved correctely
class(AnnoDb)
## [1] "data.frame"
head(AnnoDb)
## ENSEMBLTRANS ENSEMBL
## 1 ENST00000543404 ENSG00000256069
## 2 ENST00000566278 ENSG00000256069
## 3 ENST00000545343 ENSG00000256069
## 4 ENST00000544183 ENSG00000256069
## 5 ENST00000286479 ENSG00000156006
## 6 ENST00000520116 ENSG00000156006
.sf files have been created from fastq data by salmon
# This code chunk needs to be written by yourself
#
# Define sample names
SampleNames <- c(paste0("Mock-rep", 1:3), paste0("SARS-CoV-2-rep", 1:3))
# Define group level
GroupLevel <- c("Mock", "Covid")
# Define contrast for DE analysis
Contrast <- c("Group", GroupLevel)
# Define a vector for comparing TPM vs Counts effect
TvC <- c("TPM", "Counts")
levels(TvC) <- TvC
# Define .sf file path
sf <- c(paste0("salmon_output/",
SampleNames,
".salmon_quant/quant.sf"))
# Define sample groups
group <- c(rep(GroupLevel[1], 3), rep(GroupLevel[2], 3))
# Create metadata
metadata <- data.frame(Sample=factor(SampleNames, levels=SampleNames),
Group=factor(group, levels=GroupLevel),
Path=sf)
rownames(metadata) <- SampleNames
# Explore the metadata
print(metadata)
## Sample Group
## Mock-rep1 Mock-rep1 Mock
## Mock-rep2 Mock-rep2 Mock
## Mock-rep3 Mock-rep3 Mock
## SARS-CoV-2-rep1 SARS-CoV-2-rep1 Covid
## SARS-CoV-2-rep2 SARS-CoV-2-rep2 Covid
## SARS-CoV-2-rep3 SARS-CoV-2-rep3 Covid
## Path
## Mock-rep1 salmon_output/Mock-rep1.salmon_quant/quant.sf
## Mock-rep2 salmon_output/Mock-rep2.salmon_quant/quant.sf
## Mock-rep3 salmon_output/Mock-rep3.salmon_quant/quant.sf
## SARS-CoV-2-rep1 salmon_output/SARS-CoV-2-rep1.salmon_quant/quant.sf
## SARS-CoV-2-rep2 salmon_output/SARS-CoV-2-rep2.salmon_quant/quant.sf
## SARS-CoV-2-rep3 salmon_output/SARS-CoV-2-rep3.salmon_quant/quant.sf
# Assign sample names to the input (.sf) file path
names(sf) <- SampleNames
# Run tximport
# tpm vs original counts
# input sf: a factor of all .sf files' path
txi_tpm <- tximport(sf,
type="salmon",
tx2gene=AnnoDb,
countsFromAbundance="lengthScaledTPM", # Extracts TPM
ignoreTxVersion=T)
txi_counts <- tximport(sf,
type="salmon",
tx2gene=AnnoDb,
ignoreTxVersion=T)
# tpm
head(txi_tpm$counts)
## Mock-rep1 Mock-rep2 Mock-rep3 SARS-CoV-2-rep1
## ENSG00000000005 7.069840 3.9538769 0.00000 2.004495
## ENSG00000001561 2862.599921 2813.7838311 2505.36278 2443.324902
## ENSG00000002933 15.843325 30.0172780 20.35533 23.846056
## ENSG00000003056 1376.863217 1457.4984469 1408.32364 2236.783582
## ENSG00000003137 6.937593 0.9777292 4.08993 2.006846
## ENSG00000004478 310.042601 359.1166879 241.37580 323.219596
## SARS-CoV-2-rep2 SARS-CoV-2-rep3
## ENSG00000000005 0.991949 3.993719
## ENSG00000001561 1120.670894 2064.616537
## ENSG00000002933 16.130760 22.360242
## ENSG00000003056 1261.607594 1345.676809
## ENSG00000003137 0.000000 3.949410
## ENSG00000004478 249.048081 255.754081
dim(txi_tpm$counts)
## [1] 12202 6
# counts
head(txi_counts$counts)
## Mock-rep1 Mock-rep2 Mock-rep3 SARS-CoV-2-rep1 SARS-CoV-2-rep2
## ENSG00000000005 7.000 4.000 0.000 2.000 1.000
## ENSG00000001561 2889.000 2782.000 2515.000 2498.000 1102.000
## ENSG00000002933 17.000 32.000 18.000 25.000 14.000
## ENSG00000003056 1423.407 1538.976 1321.634 2218.945 1274.869
## ENSG00000003137 7.000 1.000 4.000 2.000 0.000
## ENSG00000004478 277.049 305.781 224.858 378.571 237.257
## SARS-CoV-2-rep3
## ENSG00000000005 4.000
## ENSG00000001561 2073.000
## ENSG00000002933 24.000
## ENSG00000003056 1318.894
## ENSG00000003137 4.000
## ENSG00000004478 310.414
dim(txi_counts$counts)
## [1] 12202 6
# Set a function creating dds and vsd
dds_vsd_fn <- function(txi) {
# Create a DESeq object (so-calledd dds)
des <- DESeqDataSetFromTximport(txi,
colData=metadata,
design=~Group)
# Create a vsd object (so-called vsd)
ves <- vst(des, blind=T)
# Output them as a list
return(list(dds=des, vsd=ves))
}
TPM <- dds_vsd_fn(txi_tpm)
Counts <- dds_vsd_fn(txi_counts)
# Outputs
# dds: TPM/Counts[[1]] or TPM/Counts[['dds']]
# vsd: TPM/Counts[[2]] or TPM/Counts[['vsd']]
# TPM
TPM[[1]]
## class: DESeqDataSet
## dim: 12202 6
## metadata(1): version
## assays(1): counts
## rownames(12202): ENSG00000000005 ENSG00000001561 ... ENSG00000288631
## ENSG00000288642
## rowData names(0):
## colnames(6): Mock-rep1 Mock-rep2 ... SARS-CoV-2-rep2 SARS-CoV-2-rep3
## colData names(3): Sample Group Path
head(counts(TPM[[1]]))
## Mock-rep1 Mock-rep2 Mock-rep3 SARS-CoV-2-rep1 SARS-CoV-2-rep2
## ENSG00000000005 7 4 0 2 1
## ENSG00000001561 2863 2814 2505 2443 1121
## ENSG00000002933 16 30 20 24 16
## ENSG00000003056 1377 1457 1408 2237 1262
## ENSG00000003137 7 1 4 2 0
## ENSG00000004478 310 359 241 323 249
## SARS-CoV-2-rep3
## ENSG00000000005 4
## ENSG00000001561 2065
## ENSG00000002933 22
## ENSG00000003056 1346
## ENSG00000003137 4
## ENSG00000004478 256
# Counts
Counts[[1]]
## class: DESeqDataSet
## dim: 12202 6
## metadata(1): version
## assays(2): counts avgTxLength
## rownames(12202): ENSG00000000005 ENSG00000001561 ... ENSG00000288631
## ENSG00000288642
## rowData names(0):
## colnames(6): Mock-rep1 Mock-rep2 ... SARS-CoV-2-rep2 SARS-CoV-2-rep3
## colData names(3): Sample Group Path
head(counts(Counts[[1]]))
## Mock-rep1 Mock-rep2 Mock-rep3 SARS-CoV-2-rep1 SARS-CoV-2-rep2
## ENSG00000000005 7 4 0 2 1
## ENSG00000001561 2889 2782 2515 2498 1102
## ENSG00000002933 17 32 18 25 14
## ENSG00000003056 1423 1539 1322 2219 1275
## ENSG00000003137 7 1 4 2 0
## ENSG00000004478 277 306 225 379 237
## SARS-CoV-2-rep3
## ENSG00000000005 4
## ENSG00000001561 2073
## ENSG00000002933 24
## ENSG00000003056 1319
## ENSG00000003137 4
## ENSG00000004478 310
# TPM
TPM[[2]]
## class: DESeqTransform
## dim: 12202 6
## metadata(1): version
## assays(1): ''
## rownames(12202): ENSG00000000005 ENSG00000001561 ... ENSG00000288631
## ENSG00000288642
## rowData names(4): baseMean baseVar allZero dispFit
## colnames(6): Mock-rep1 Mock-rep2 ... SARS-CoV-2-rep2 SARS-CoV-2-rep3
## colData names(4): Sample Group Path sizeFactor
# Counts
Counts[[2]]
## class: DESeqTransform
## dim: 12202 6
## metadata(1): version
## assays(1): ''
## rownames(12202): ENSG00000000005 ENSG00000001561 ... ENSG00000288631
## ENSG00000288642
## rowData names(4): baseMean baseVar allZero dispFit
## colnames(6): Mock-rep1 Mock-rep2 ... SARS-CoV-2-rep2 SARS-CoV-2-rep3
## colData names(3): Sample Group Path
# Set a function estimating size factors, dispersions, and perform wald test
DESeqPrep_fn <- function(List) {
List[[1]] <- estimateSizeFactors(List[[1]])
List[[1]] <- estimateDispersions(List[[1]])
List[[1]] <- nbinomWaldTest(List[[1]])
return(List)
}
# Update dds with the function
Counts <- DESeqPrep_fn(Counts)
TPM <- DESeqPrep_fn(TPM)
sizeFactors(Counts[[1]])
## NULL
sizeFactors(TPM[[1]])
## Mock-rep1 Mock-rep2 Mock-rep3 SARS-CoV-2-rep1 SARS-CoV-2-rep2
## 1.1074118 1.0517854 0.8854318 1.3678281 0.7650195
## SARS-CoV-2-rep3
## 1.0042626
# Size factors don't exist in the Counts dds!
# Normalization factors are calculated in the Counts dds instead!
assays(Counts[[1]])
## List of length 6
## names(6): counts avgTxLength normalizationFactors mu H cooks
assays(TPM[[1]])
## List of length 4
## names(4): counts mu H cooks
colData(Counts[[1]])
## DataFrame with 6 rows and 3 columns
## Sample Group Path
## <factor> <factor> <character>
## Mock-rep1 Mock-rep1 Mock salmon_output/Mock-r..
## Mock-rep2 Mock-rep2 Mock salmon_output/Mock-r..
## Mock-rep3 Mock-rep3 Mock salmon_output/Mock-r..
## SARS-CoV-2-rep1 SARS-CoV-2-rep1 Covid salmon_output/SARS-C..
## SARS-CoV-2-rep2 SARS-CoV-2-rep2 Covid salmon_output/SARS-C..
## SARS-CoV-2-rep3 SARS-CoV-2-rep3 Covid salmon_output/SARS-C..
colData(TPM[[1]])
## DataFrame with 6 rows and 4 columns
## Sample Group Path sizeFactor
## <factor> <factor> <character> <numeric>
## Mock-rep1 Mock-rep1 Mock salmon_output/Mock-r.. 1.107412
## Mock-rep2 Mock-rep2 Mock salmon_output/Mock-r.. 1.051785
## Mock-rep3 Mock-rep3 Mock salmon_output/Mock-r.. 0.885432
## SARS-CoV-2-rep1 SARS-CoV-2-rep1 Covid salmon_output/SARS-C.. 1.367828
## SARS-CoV-2-rep2 SARS-CoV-2-rep2 Covid salmon_output/SARS-C.. 0.765019
## SARS-CoV-2-rep3 SARS-CoV-2-rep3 Covid salmon_output/SARS-C.. 1.004263
# Extract and save the size factors in a data frame
sizeFactor <- as.data.frame(round(sizeFactors(TPM[[1]]), 3))
colnames(sizeFactor) <- 'Size_Factor'
sizeFactor <- sizeFactor %>%
rownames_to_column(var="Sample") %>%
inner_join(metadata[, 1:ncol(metadata)-1], by="Sample")
sizeFactor$Sample <- factor(sizeFactor$Sample, levels=SampleNames)
# Create a plot comparing the size factors by sample
ggplot(sizeFactor, aes(x=Sample,
y=Size_Factor,
fill=Group,
label=Size_Factor)) +
geom_bar(stat="identity", width=0.8) +
theme_bw() +
ggtitle("Size Factors in TPM-DESeq") +
geom_text(vjust=1.5) +
theme(axis.text.x=element_text(angle=45,
vjust=0.5)) + ylab("Size Factor") + geom_hline(yintercept=1, color="blue", linetype="dashed")
# Extract and normalization factors in a data frame
normf <- as.data.frame(normalizationFactors(Counts[[1]])) %>%
gather(Sample, Normalization_Factor) %>%
inner_join(metadata[, 1:2], by="Sample")
normf$Sample <- factor(normf$Sample, levels=SampleNames)
normf$Group <- factor(normf$Group, levels=GroupLevel)
# Create a scatter plot showing distribution of normalization factors
normFactors_plot <- ggplot(normf,
aes(x=Sample, y=Normalization_Factor)) +
geom_jitter(alpha=0.5, aes(color=Group)) +
# Add a boxplot to provide statistics in each sample
geom_boxplot(aes(x=Sample, y=Normalization_Factor),
outlier.shape=NA, alpha=0.5) +
theme_bw() +
ggtitle("Normalization Factors in Counts-DESeq") +
theme(axis.text.x=element_text(angle=45,
vjust=0.5)) +
ylab("Normalization Factor / Gene") +
# Add a dashed horizontal line to indicate where normalization factor=1
geom_hline(yintercept=1,
color="blue",
linetype="dashed")
# Print the normalization factor scatter plot
print(normFactors_plot)
# Print the same plot with larger y-magnification in order to observe the box plot
normFactors_plot +
ylim(0.5, 1.5)
# Assigne what to compare
GroupOfInterest <- Contrast[1]
# Set a function for a PCA plot
QCPCA_fn <- function(inputList, Title) {
plotPCA(inputList[[2]], # takes vsd
intgroup=GroupOfInterest) + theme_bw() + ggtitle(Title)
}
# Set heatmap annotation
ColOfInterest <- !colnames(metadata) %in% c("Sample", "Path")
HeatmapAnno <- as.data.frame(metadata[, ColOfInterest])
rownames(HeatmapAnno) <- SampleNames
colnames(HeatmapAnno) <- colnames(metadata)[ColOfInterest]
# Set a function for a correlation heatmap
QCcorrHeatmap_fn <- function(inputList, Title) {
# Extract transformed count matrix
mtx <- assay(inputList[[2]]) # takes vsd
# Calculate correlation and store in the matrix
mtx <- cor(mtx)
# Create a correlation heatmap
return(pheatmap(mtx,
annotation=HeatmapAnno,
main=paste("Sample Correlation Heatmap:",
Title)))
}
grid.arrange(QCPCA_fn(TPM, "QC PCA: TPM"),
QCPCA_fn(Counts, "QC PCA: Counts"),
nrow=2)
# TPM
QCcorrHeatmap_fn(TPM, "TPM")
# Counts
QCcorrHeatmap_fn(Counts, "Counts")
# Create a list consisted with dds objects from TPM and Counts
desList <- list(TPM[[1]], Counts[[1]])
names(desList) <- TvC
# Create a list for TPM and Counts dds
ddsList <- desList # DE without shrinkage
normal.ddsList <- desList # DE with "normal" shrinkage
ape.ddsList <- desList # DE with "apeglm" shrinkage
ash.ddsList <- desList # DE with "ashr" shrinkage
for (x in TvC) {
# Run DESeq()
ddsList[[x]] <- DESeq(desList[[x]])
print(resultsNames(ddsList[[x]]))
normal.ddsList[[x]] <- lfcShrink(ddsList[[x]],
contrast=Contrast,
type="normal")
ape.ddsList[[x]] <- lfcShrink(ddsList[[x]],
coef=resultsNames(ddsList[[x]])[2],
type="apeglm")
ash.ddsList[[x]] <- lfcShrink(ddsList[[x]],
coef=resultsNames(ddsList[[x]])[2],
type="ashr")
}
## [1] "Intercept" "Group_Covid_vs_Mock"
## [1] "Intercept" "Group_Covid_vs_Mock"
# Combine every ddsList into a list
all.ddsList <- list(ddsList, normal.ddsList, ape.ddsList, ash.ddsList)
shrinkage <- c("None", "Normal", "Apeglm", "Ashr")
names(all.ddsList) <- shrinkage
# Plot dispersion
for (x in TvC) {
plotDispEsts(ddsList[[x]],
ylab="Dispersion",
xlab="Mean of Normalized Counts",
main=paste("Dispersion of", x, "Input"))
}
# Set FDR threshold
alpha=0.1
# FDR threshold vector
FDRv=c("< 0.1", "> 0.1")
# Initialize lists of result tables
all.resList <- all.ddsList
# Set a function cleaning table
Sig.fn <- function(df, Input) {
df <- df %>%
rownames_to_column(var="Gene") %>%
mutate(FDR=ifelse(padj < 0.1 & !is.na(padj),
FDRv[1],
FDRv[2]),
Input=Input)
return(df)
}
for (i in shrinkage) {
if (i == "None") {
for (x in TvC) {
# Extract data frames from unshrunken lfc & clean data
all.resList[[i]][[x]] <- as.data.frame(results(all.ddsList[[i]][[x]],
contrast=Contrast,
alpha=alpha)) %>%
Sig.fn(x)
}
} else {
# Extract data frames from shrunken lfc & clean data
for (x in TvC) {
all.resList[[i]][[x]] <- as.data.frame(all.ddsList[[i]][[x]]) %>%
Sig.fn(x)
}
}
}
# Explore the results
summary(all.resList)
## Length Class Mode
## None 2 -none- list
## Normal 2 -none- list
## Apeglm 2 -none- list
## Ashr 2 -none- list
head(all.resList[[1]][['TPM']])
## Gene baseMean log2FoldChange lfcSE stat pvalue
## 1 ENSG00000000005 2.812742 0.5987721 1.6141938 0.3709419 7.106808e-01
## 2 ENSG00000001561 2232.914457 0.6069591 0.1459519 4.1586231 3.201717e-05
## 3 ENSG00000002933 20.987678 0.1216964 0.5714864 0.2129472 8.313682e-01
## 4 ENSG00000003056 1474.040931 -0.1332132 0.1492349 -0.8926414 3.720493e-01
## 5 ENSG00000003137 2.872429 1.0848946 1.6061178 0.6754764 4.993732e-01
## 6 ENSG00000004478 284.996011 0.1359892 0.2062583 0.6593152 5.096934e-01
## padj FDR Input
## 1 NA > 0.1 TPM
## 2 0.000713163 < 0.1 TPM
## 3 0.962417071 > 0.1 TPM
## 4 0.760029654 > 0.1 TPM
## 5 NA > 0.1 TPM
## 6 0.850964349 > 0.1 TPM
head(all.resList[[1]][['Counts']])
## Gene baseMean log2FoldChange lfcSE stat pvalue
## 1 ENSG00000000005 2.856685 0.6047562 1.5942774 0.3793293 7.044433e-01
## 2 ENSG00000001561 2267.273729 0.6080151 0.1463827 4.1536009 3.272838e-05
## 3 ENSG00000002933 21.348392 0.1187991 0.5674509 0.2093557 8.341706e-01
## 4 ENSG00000003056 1495.567472 -0.1320311 0.1496493 -0.8822703 3.776306e-01
## 5 ENSG00000003137 2.911904 1.0944043 1.5875474 0.6893680 4.905917e-01
## 6 ENSG00000004478 286.986305 0.1409851 0.2055848 0.6857757 4.928546e-01
## padj FDR Input
## 1 NA > 0.1 Counts
## 2 0.0006904505 < 0.1 Counts
## 3 0.9589769953 > 0.1 Counts
## 4 0.7522981052 > 0.1 Counts
## 5 NA > 0.1 Counts
## 6 0.8353881362 > 0.1 Counts
head(all.resList[[2]][['TPM']])
## Gene baseMean log2FoldChange lfcSE stat pvalue
## 1 ENSG00000000005 2.812742 0.13169158 0.3552874 0.3709419 7.106808e-01
## 2 ENSG00000001561 2232.914457 0.58983548 0.1418306 4.1586231 3.201717e-05
## 3 ENSG00000002933 20.987678 0.08415521 0.3954220 0.2129472 8.313682e-01
## 4 ENSG00000003056 1474.040931 -0.12928703 0.1448374 -0.8926414 3.720493e-01
## 5 ENSG00000003137 2.872429 0.23953496 0.3578170 0.6754764 4.993732e-01
## 6 ENSG00000004478 284.996011 0.12852465 0.1949507 0.6593152 5.096934e-01
## padj FDR Input
## 1 NA > 0.1 TPM
## 2 0.000713163 < 0.1 TPM
## 3 0.962417071 > 0.1 TPM
## 4 0.760029654 > 0.1 TPM
## 5 NA > 0.1 TPM
## 6 0.850964349 > 0.1 TPM
head(all.resList[[2]][['Counts']])
## Gene baseMean log2FoldChange lfcSE stat pvalue
## 1 ENSG00000000005 2.856685 0.1356995 0.3580169 0.3793293 7.044433e-01
## 2 ENSG00000001561 2267.273729 0.5907791 0.1422291 4.1536009 3.272838e-05
## 3 ENSG00000002933 21.348392 0.0825164 0.3944490 0.2093557 8.341706e-01
## 4 ENSG00000003056 1495.567472 -0.1281219 0.1452198 -0.8822703 3.776306e-01
## 5 ENSG00000003137 2.911904 0.2463960 0.3604316 0.6893680 4.905917e-01
## 6 ENSG00000004478 286.986305 0.1332946 0.1943962 0.6857757 4.928546e-01
## padj FDR Input
## 1 NA > 0.1 Counts
## 2 0.0006904505 < 0.1 Counts
## 3 0.9589769953 > 0.1 Counts
## 4 0.7522981052 > 0.1 Counts
## 5 NA > 0.1 Counts
## 6 0.8353881362 > 0.1 Counts
# Set ylim: has to adjusted by users depending on data
yl <- c(-25, 25)
# Set min log2 fold change of interest
mLog <- c(-1, 1)
# Initialize a list storing MA plots
MAList <- ddsList
# Create MA plots
for (i in shrinkage) {
both.df <- rbind(all.resList[[i]][[TvC[1]]],
all.resList[[i]][[TvC[2]]])
MAList[[i]] <- ggplot(both.df,
aes(x=baseMean, y=log2FoldChange, color=FDR)) +geom_point() + scale_x_log10() + facet_grid(~Input) +
theme_bw() +
scale_color_manual(values=c("blue", "grey")) +
ggtitle(paste("MA plot with", i)) +
ylim(yl[1], yl[2]) +
geom_hline(yintercept=c(mLog[1], mLog[2]), linetype="dashed", color="red")
}
# Print MA plots
MAList
## $TPM
## class: DESeqDataSet
## dim: 12202 6
## metadata(1): version
## assays(4): counts mu H cooks
## rownames(12202): ENSG00000000005 ENSG00000001561 ... ENSG00000288631
## ENSG00000288642
## rowData names(22): baseMean baseVar ... deviance maxCooks
## colnames(6): Mock-rep1 Mock-rep2 ... SARS-CoV-2-rep2 SARS-CoV-2-rep3
## colData names(4): Sample Group Path sizeFactor
##
## $Counts
## class: DESeqDataSet
## dim: 12202 6
## metadata(1): version
## assays(6): counts avgTxLength ... H cooks
## rownames(12202): ENSG00000000005 ENSG00000001561 ... ENSG00000288631
## ENSG00000288642
## rowData names(22): baseMean baseVar ... deviance maxCooks
## colnames(6): Mock-rep1 Mock-rep2 ... SARS-CoV-2-rep2 SARS-CoV-2-rep3
## colData names(3): Sample Group Path
##
## $None
##
## $Normal
##
## $Apeglm
##
## $Ashr
# Subset rows with FDR < alpha
both.df <- rbind(all.resList[[1]][['TPM']],
all.resList[[1]][['Counts']])
both.df$Input <- factor(both.df$Input, levels=TvC)
# Plot distribution of FDR
ggplot(both.df,
aes(x=padj, color=Input)) +
geom_density(size=1, aes(y=..count..)) +
theme_bw() +
ggtitle("Distribution of False Discovery Rate (FDR)") +
xlab("Adjusted P-Value (FDR)") +
ylab("Count") +
geom_vline(xintercept=alpha,
color="black",
linetype="dashed",
size=1) +
scale_x_continuous(breaks=seq(0, 1, by=0.1))
# Initialize a lfc list
lfcplotList <- all.resList
# Create lfc distribution plots
for (i in shrinkage) {
lfc.df <- rbind(all.resList[[i]][[TvC[1]]],
all.resList[[i]][[TvC[2]]])
lfc.df <- lfc.df[lfc.df$FDR == "< 0.1",]
lfc.df$Input <- factor(lfc.df$Input, levels=TvC)
lfcplotList[[i]] <- ggplot(lfc.df, # Subset rows with FDR < alpha
aes(x=log2FoldChange, color=Input)) +
geom_density(size=1, aes(y=..count..)) +
theme_bw() + ylab("Count") +
geom_vline(xintercept=c(mLog[1], mLog[2]),
color="black",
linetype="dashed",
size=1) +
ggtitle(paste("Distribution of Log2FoldChange by Input Type:", i)) +
xlim(-8, 8)
}
# Print the lfc plots
lfcplotList
## $None
##
## $Normal
##
## $Apeglm
##
## $Ashr
# Count number of NA genes
type=c("Zero Counts", "Outliers", "Total NA Genes")
# Create a data frame storing NA gene number
NAstat <- both.df %>%
group_by(Input) %>%
summarize(zero=sum(is.na(log2FoldChange)),
outlier=sum(is.na(pvalue) & is.na(padj))) %>%
mutate(total=zero + outlier) %>%
gather(Type, Number, -Input) %>%
mutate(Type=factor(case_when(Type == "zero" ~ type[1],
Type == "outlier" ~ type[2],
Type == "total" ~ type[3]),
levels=type))
# Plot number of NA genes
ggplot(NAstat,
aes(x=Type, y=Number, group=Input, fill=Input, label=Number)) +
geom_bar(stat="identity", position="dodge") +
theme_bw() +
geom_text(position=position_dodge(width=1), vjust=1.5) +
ggtitle("Number of NA Genes") +
ylab("Number of Genes")
# Create a new list having DE table with FDR below alpha
fdr.rank <- all.resList
lfc.rank <- all.resList
up.lfc.rank <- all.resList
down.lfc.rank <- all.resList
# Set a function cleaning a data frame
filter.fdr.fn <- function(df) {as.data.table(df[df$FDR == FDRv[1],])}
# Set a function creating a column for the rank
Ranking.fn <- function(x) {mutate(x, Rank=1:nrow(x))}
for (i in shrinkage) {
for (x in TvC) {
rankdf <- all.resList[[i]][[x]]
fdr.rank[[i]][[x]] <- filter.fdr.fn(rankdf) %>% arrange(padj) %>% Ranking.fn()
lfc.rank[[i]][[x]] <- filter.fdr.fn(rankdf) %>% arrange(desc(abs(log2FoldChange))) %>% Ranking.fn()
up.lfc.rank[[i]][[x]] <- filter.fdr.fn(rankdf) %>%
arrange(desc(log2FoldChange)) %>%
Ranking.fn()
down.lfc.rank[[i]][[x]] <- filter.fdr.fn(rankdf) %>%
arrange(log2FoldChange) %>%
Ranking.fn()
}
}
# Explore the ranking outputs
head(fdr.rank[[1]][[1]])
## Gene baseMean log2FoldChange lfcSE stat pvalue
## 1: ENSG00000118523 3782.0311 -3.485938 0.1962458 -17.76312 1.364396e-70
## 2: ENSG00000181104 2461.7802 -1.955899 0.1228322 -15.92334 4.364442e-57
## 3: ENSG00000163814 1501.3368 -2.750119 0.1924159 -14.29258 2.434347e-46
## 4: ENSG00000211455 11847.3131 -1.939889 0.1370266 -14.15703 1.689964e-45
## 5: ENSG00000163762 1897.0253 -2.032626 0.1454228 -13.97735 2.143160e-44
## 6: ENSG00000138271 526.3565 -3.596864 0.2642291 -13.61267 3.367067e-42
## padj FDR Input Rank
## 1: 4.984140e-67 < 0.1 TPM 1
## 2: 7.971653e-54 < 0.1 TPM 2
## 3: 2.964223e-43 < 0.1 TPM 3
## 4: 1.543360e-42 < 0.1 TPM 4
## 5: 1.565793e-41 < 0.1 TPM 5
## 6: 2.049982e-39 < 0.1 TPM 6
head(fdr.rank[[1]][[2]])
## Gene baseMean log2FoldChange lfcSE stat pvalue
## 1: ENSG00000118523 3838.9322 -3.484644 0.1958885 -17.78892 8.612446e-71
## 2: ENSG00000181104 2499.2766 -1.954952 0.1237453 -15.79819 3.201777e-56
## 3: ENSG00000163814 1523.3582 -2.749530 0.1921129 -14.31205 1.840047e-46
## 4: ENSG00000211455 12017.3314 -1.938785 0.1377660 -14.07303 5.563266e-45
## 5: ENSG00000163762 1920.3664 -2.031763 0.1458131 -13.93402 3.935649e-44
## 6: ENSG00000138271 533.8481 -3.595748 0.2628070 -13.68208 1.299139e-42
## padj FDR Input Rank
## 1: 3.016079e-67 < 0.1 Counts 1
## 2: 5.606311e-53 < 0.1 Counts 2
## 3: 2.147949e-43 < 0.1 Counts 3
## 4: 4.870639e-42 < 0.1 Counts 4
## 5: 2.756528e-41 < 0.1 Counts 5
## 6: 7.582642e-40 < 0.1 Counts 6
head(lfc.rank[[1]][[1]])
## Gene baseMean log2FoldChange lfcSE stat pvalue
## 1: ENSG00000244398 15.032297 -20.412256 3.9097462 -5.220865 1.780897e-07
## 2: ENSG00000248167 14.754788 -7.337706 1.3595099 -5.397317 6.764473e-08
## 3: ENSG00000198796 46.260638 -6.563202 1.5071724 -4.354646 1.332824e-05
## 4: ENSG00000178776 7.011544 -6.267343 1.6282173 -3.849206 1.185015e-04
## 5: ENSG00000263647 6.032185 6.052644 1.6127242 3.753056 1.746919e-04
## 6: ENSG00000205277 31.318717 -5.964592 0.9955208 -5.991429 2.080048e-09
## padj FDR Input Rank
## 1: 7.477720e-06 < 0.1 TPM 1
## 2: 3.013490e-06 < 0.1 TPM 2
## 3: 3.267655e-04 < 0.1 TPM 3
## 4: 2.153661e-03 < 0.1 TPM 4
## 5: 2.927291e-03 < 0.1 TPM 5
## 6: 1.225551e-07 < 0.1 TPM 6
head(lfc.rank[[1]][[2]])
## Gene baseMean log2FoldChange lfcSE stat pvalue
## 1: ENSG00000244398 15.382334 -20.039517 3.9097199 -5.125563 2.966494e-07
## 2: ENSG00000198796 44.976447 -7.660505 1.3185827 -5.809651 6.260322e-09
## 3: ENSG00000248167 14.908009 -7.331863 1.3535326 -5.416835 6.066326e-08
## 4: ENSG00000178776 7.107135 -6.265639 1.6141288 -3.881747 1.037088e-04
## 5: ENSG00000263647 6.125926 6.056916 1.5988185 3.788370 1.516389e-04
## 6: ENSG00000205277 19.783206 -5.933502 0.9114866 -6.509697 7.530253e-11
## padj FDR Input Rank
## 1: 1.141611e-05 < 0.1 Counts 1
## 2: 3.272186e-07 < 0.1 Counts 2
## 3: 2.590765e-06 < 0.1 Counts 3
## 4: 1.806906e-03 < 0.1 Counts 4
## 5: 2.516775e-03 < 0.1 Counts 5
## 6: 5.860210e-09 < 0.1 Counts 6
head(up.lfc.rank[[1]][[1]])
## Gene baseMean log2FoldChange lfcSE stat pvalue
## 1: ENSG00000263647 6.032185 6.052644 1.612724 3.753056 1.746919e-04
## 2: ENSG00000271723 19.767862 5.204963 1.040490 5.002416 5.661633e-07
## 3: ENSG00000286053 5.058302 4.795716 1.832224 2.617428 8.859512e-03
## 4: ENSG00000261408 5.481395 4.148588 1.541464 2.691330 7.116780e-03
## 5: ENSG00000251177 8.960407 3.761525 1.093682 3.439321 5.831748e-04
## 6: ENSG00000205111 8.949362 3.715642 1.141438 3.255230 1.133007e-03
## padj FDR Input Rank
## 1: 2.927291e-03 < 0.1 TPM 1
## 2: 2.150265e-05 < 0.1 TPM 2
## 3: 7.035608e-02 < 0.1 TPM 3
## 4: 5.922004e-02 < 0.1 TPM 4
## 5: 8.131059e-03 < 0.1 TPM 5
## 6: 1.406225e-02 < 0.1 TPM 6
head(up.lfc.rank[[1]][[2]])
## Gene baseMean log2FoldChange lfcSE stat pvalue
## 1: ENSG00000263647 6.125926 6.056916 1.5988185 3.788370 1.516389e-04
## 2: ENSG00000271723 20.267804 4.781448 0.9646888 4.956467 7.178645e-07
## 3: ENSG00000251177 9.087309 3.767220 1.0849146 3.472366 5.158934e-04
## 4: ENSG00000205002 15.239188 3.413080 0.8618122 3.960353 7.483913e-05
## 5: ENSG00000205111 8.562319 3.400694 1.0772793 3.156744 1.595416e-03
## 6: ENSG00000105641 14.467906 3.174780 0.8040172 3.948647 7.859416e-05
## padj FDR Input Rank
## 1: 2.516775e-03 < 0.1 Counts 1
## 2: 2.539355e-05 < 0.1 Counts 2
## 3: 6.975516e-03 < 0.1 Counts 3
## 4: 1.386702e-03 < 0.1 Counts 4
## 5: 1.756964e-02 < 0.1 Counts 5
## 6: 1.435966e-03 < 0.1 Counts 6
head(down.lfc.rank[[1]][[1]])
## Gene baseMean log2FoldChange lfcSE stat pvalue
## 1: ENSG00000244398 15.032297 -20.412256 3.9097462 -5.220865 1.780897e-07
## 2: ENSG00000248167 14.754788 -7.337706 1.3595099 -5.397317 6.764473e-08
## 3: ENSG00000198796 46.260638 -6.563202 1.5071724 -4.354646 1.332824e-05
## 4: ENSG00000178776 7.011544 -6.267343 1.6282173 -3.849206 1.185015e-04
## 5: ENSG00000205277 31.318717 -5.964592 0.9955208 -5.991429 2.080048e-09
## 6: ENSG00000137673 134.059247 -5.337356 1.1053841 -4.828508 1.375598e-06
## padj FDR Input Rank
## 1: 7.477720e-06 < 0.1 TPM 1
## 2: 3.013490e-06 < 0.1 TPM 2
## 3: 3.267655e-04 < 0.1 TPM 3
## 4: 2.153661e-03 < 0.1 TPM 4
## 5: 1.225551e-07 < 0.1 TPM 5
## 6: 4.696319e-05 < 0.1 TPM 6
head(down.lfc.rank[[1]][[2]])
## Gene baseMean log2FoldChange lfcSE stat pvalue
## 1: ENSG00000244398 15.382334 -20.039517 3.9097199 -5.125563 2.966494e-07
## 2: ENSG00000198796 44.976447 -7.660505 1.3185827 -5.809651 6.260322e-09
## 3: ENSG00000248167 14.908009 -7.331863 1.3535326 -5.416835 6.066326e-08
## 4: ENSG00000178776 7.107135 -6.265639 1.6141288 -3.881747 1.037088e-04
## 5: ENSG00000205277 19.783206 -5.933502 0.9114866 -6.509697 7.530253e-11
## 6: ENSG00000137673 136.045521 -5.333046 1.1047964 -4.827175 1.384833e-06
## padj FDR Input Rank
## 1: 1.141611e-05 < 0.1 Counts 1
## 2: 3.272186e-07 < 0.1 Counts 2
## 3: 2.590765e-06 < 0.1 Counts 3
## 4: 1.806906e-03 < 0.1 Counts 4
## 5: 5.860210e-09 < 0.1 Counts 5
## 6: 4.369085e-05 < 0.1 Counts 6
# Set a function rebuilding DE tables with gene ranks
rankdiff.fn <- function(List){
# Select columns and join the data frames by gene
full_join(List[[TvC[1]]][, .(Gene, Input, Rank, baseMean)],
List[[TvC[2]]][, .(Gene, Input, Rank, baseMean)],
by="Gene") %>%
# Add columns assining gene expression levels, rank differences, and mean ranks
mutate(logMeanExpression=log(baseMean.x+baseMean.y/2),
RankDiff=Rank.x-Rank.y,
MeanRank=(Rank.x+Rank.y)/2)
}
# Set a function adding Shrinkage column
add.shr.fn <- function(df, shr) {mutate(df, Shrinkage=shr)}
# Set a function rbinding multiple data frames
rbinds.fn <- function(List) {rbind(List[[1]],
List[[2]],
List[[3]],
List[[4]])}
# Calculate and plot rank difference
for (i in shrinkage) {
# Calculate rank difference
fdr.rank[[i]] <- rankdiff.fn(fdr.rank[[i]]) %>% add.shr.fn(i)
lfc.rank[[i]] <- rankdiff.fn(lfc.rank[[i]]) %>% add.shr.fn(i)
up.lfc.rank[[i]] <- rankdiff.fn(up.lfc.rank[[i]]) %>% add.shr.fn(i)
down.lfc.rank[[i]] <- rankdiff.fn(down.lfc.rank[[i]]) %>% add.shr.fn(i)
}
# Create combined data frames across the shrinkages
fdr.rank.df <- rbinds.fn(fdr.rank)
lfc.rank.df <- rbinds.fn(lfc.rank)
up.lfc.rank.df <- rbinds.fn(up.lfc.rank)
down.lfc.rank.df <- rbinds.fn(down.lfc.rank)
# Explore the ranking outputs
head(fdr.rank.df)
## Gene Input.x Rank.x baseMean.x Input.y Rank.y baseMean.y
## 1: ENSG00000118523 TPM 1 3782.0311 Counts 1 3838.9322
## 2: ENSG00000181104 TPM 2 2461.7802 Counts 2 2499.2766
## 3: ENSG00000163814 TPM 3 1501.3368 Counts 3 1523.3582
## 4: ENSG00000211455 TPM 4 11847.3131 Counts 4 12017.3314
## 5: ENSG00000163762 TPM 5 1897.0253 Counts 5 1920.3664
## 6: ENSG00000138271 TPM 6 526.3565 Counts 6 533.8481
## logMeanExpression RankDiff MeanRank Shrinkage
## 1: 8.648484 0 1 None
## 2: 8.219169 0 2 None
## 3: 7.724454 0 3 None
## 4: 9.790094 0 4 None
## 5: 7.957600 0 5 None
## 6: 6.676177 0 6 None
head(lfc.rank.df)
## Gene Input.x Rank.x baseMean.x Input.y Rank.y baseMean.y
## 1: ENSG00000244398 TPM 1 15.032297 Counts 1 15.382334
## 2: ENSG00000248167 TPM 2 14.754788 Counts 3 14.908009
## 3: ENSG00000198796 TPM 3 46.260638 Counts 2 44.976447
## 4: ENSG00000178776 TPM 4 7.011544 Counts 4 7.107135
## 5: ENSG00000263647 TPM 5 6.032185 Counts 5 6.125926
## 6: ENSG00000205277 TPM 6 31.318717 Counts 6 19.783206
## logMeanExpression RankDiff MeanRank Shrinkage
## 1: 3.123398 0 1.0 None
## 2: 3.100488 -1 2.5 None
## 3: 4.230460 1 2.5 None
## 4: 2.357557 0 4.0 None
## 5: 2.207741 0 5.0 None
## 6: 3.718689 0 6.0 None
head(up.lfc.rank.df)
## Gene Input.x Rank.x baseMean.x Input.y Rank.y baseMean.y
## 1: ENSG00000263647 TPM 1 6.032185 Counts 1 6.125926
## 2: ENSG00000271723 TPM 2 19.767862 Counts 2 20.267804
## 3: ENSG00000286053 TPM 3 5.058302 <NA> NA NA
## 4: ENSG00000261408 TPM 4 5.481395 <NA> NA NA
## 5: ENSG00000251177 TPM 5 8.960407 Counts 3 9.087309
## 6: ENSG00000205111 TPM 6 8.949362 Counts 5 8.562319
## logMeanExpression RankDiff MeanRank Shrinkage
## 1: 2.207741 0 1.0 None
## 2: 3.397917 0 2.0 None
## 3: NA NA NA None
## 4: NA NA NA None
## 5: 2.602991 2 4.0 None
## 6: 2.582526 1 5.5 None
head(down.lfc.rank.df)
## Gene Input.x Rank.x baseMean.x Input.y Rank.y baseMean.y
## 1: ENSG00000244398 TPM 1 15.032297 Counts 1 15.382334
## 2: ENSG00000248167 TPM 2 14.754788 Counts 3 14.908009
## 3: ENSG00000198796 TPM 3 46.260638 Counts 2 44.976447
## 4: ENSG00000178776 TPM 4 7.011544 Counts 4 7.107135
## 5: ENSG00000205277 TPM 5 31.318717 Counts 5 19.783206
## 6: ENSG00000137673 TPM 6 134.059247 Counts 6 136.045521
## logMeanExpression RankDiff MeanRank Shrinkage
## 1: 3.123398 0 1.0 None
## 2: 3.100488 -1 2.5 None
## 3: 4.230460 1 2.5 None
## 4: 2.357557 0 4.0 None
## 5: 3.718689 0 5.0 None
## 6: 5.308674 0 6.0 None
# Set a function plotting gene ranks between TPM- (x-axis) and Counts-Inputs (y-axis)
ranking.plot.fn <- function(df, rankedby) {
df$Shrinkage <- factor(df$Shrinkage, levels=shrinkage)
ggplot(df,
aes(x=Rank.x, y=Rank.y, color=logMeanExpression)) + geom_point(alpha=0.5) + facet_grid(~Shrinkage) + theme_bw() + theme(strip.text.x=element_text(size=10)) + xlab("Rank with TPM") + ylab("Rank with Counts") + geom_abline(slope=1, color="black", size=0.5) + ggtitle(paste(rankedby, "Ranking with TPM vs Count Inputs")) + scale_color_gradient(low="blue", high="red")
}
# Print output plots
ranking.plot.fn(fdr.rank.df, "FDR")
ranking.plot.fn(lfc.rank.df, "Log2FoldChange")
ranking.plot.fn(up.lfc.rank.df, "Log2FoldChange (Increase)")
ranking.plot.fn(down.lfc.rank.df, "Log2FoldChange (Decrease)")
# Set a function plotting the rank difference over the gene expression level
rankdiff.plot.fn <- function(df, rankedby, ylim) {
df$Shrinkage <- factor(df$Shrinkage, levels=shrinkage)
ggplot(df, aes(x=logMeanExpression, y=RankDiff, color=MeanRank)) +
geom_point(alpha=0.5) +
theme_bw() +
facet_grid(~Shrinkage) +
theme(strip.text.x=element_text(size=10)) +
ylab("Rank Difference (TPM - Count)") +
ggtitle(paste("Rank Difference (TPM - Count):", rankedby)) +
geom_hline(yintercept=0, color="black", size=0.5) + scale_color_gradient(low="blue", high="red") +
ylim(-ylim, ylim)
}
# Print output plots
rankdiff.plot.fn(fdr.rank.df, "FDR", 100)
rankdiff.plot.fn(lfc.rank.df, "Log2FoldChange", 120)
rankdiff.plot.fn(up.lfc.rank.df, "Log2FoldChange (Increase)", 100)
rankdiff.plot.fn(down.lfc.rank.df, "Log2FoldChange (Decrease)", 75)
# Set a function subsetting unshrunken data
unshr.rankdiff.fn <- function(df) {subset(df, Shrinkage == "None")}
# Create a list storing rankdiff data frames
rankList <- list(unshr.rankdiff.fn(fdr.rank.df),
unshr.rankdiff.fn(lfc.rank.df),
unshr.rankdiff.fn(up.lfc.rank.df),
unshr.rankdiff.fn(down.lfc.rank.df))
# Assine result column as a factor with levels
rankdiff.levels <- c("FDR",
"log2FoldChange",
"log2FoldChange.Increase",
"log2FoldChange.Decrease")
# Name the list
names(rankList) <- rankdiff.levels
# Create a new data frame storing rank difference by result type
rankdiff.dist <- data.frame(FDR=abs(rankList[[1]]$RankDiff),
log2FoldChange=abs(rankList[[2]]$RankDiff),
log2FoldChange.Increase=abs(rankList[[3]]$RankDiff),
log2FoldChange.Decrease=abs(rankList[[4]]$RankDiff)) %>% gather(Result, RankDiff)
rankdiff.dist$Result <- factor(rankdiff.dist$Result, levels=rankdiff.levels)
# Plot distribution of absolute rank difference
ggplot(rankdiff.dist,
aes(x=Result, y=RankDiff, color=Result)) +
geom_jitter(alpha=0.5) +
geom_boxplot(alpha=0.5, fill="grey", color="black") +
theme_bw() +
theme(axis.text.x=element_text(angle=45, hjust=1)) +
ggtitle("Distribution of Absolute Rank Difference without Shrinkage \n(TPM - Count Input)") +
ylab("Absolute Rank Difference (TPM - Count Input)")
# Create a data frame storing the number of transcripts by gene id
AnnoDb.ntrans <- AnnoDb %>%
group_by(ENSEMBL) %>%
summarize(num.trans=n())
# Set a function adding the number of transcripts by gene id
add.ntrans.fn <- function(df) {
left_join(df, AnnoDb.ntrans, by=c("Gene"="ENSEMBL"))}
# Add a column indicating the number of transcripts by gene id to every rankdiff data frame
for (x in rankdiff.levels) {
rankList[[x]] <- add.ntrans.fn(rankList[[x]])
}
# Explore the edited data frames
summary(rankList)
## Length Class Mode
## FDR 12 data.table list
## log2FoldChange 12 data.table list
## log2FoldChange.Increase 12 data.table list
## log2FoldChange.Decrease 12 data.table list
head(rankList[[1]])
## Gene Input.x Rank.x baseMean.x Input.y Rank.y baseMean.y
## 1: ENSG00000118523 TPM 1 3782.0311 Counts 1 3838.9322
## 2: ENSG00000181104 TPM 2 2461.7802 Counts 2 2499.2766
## 3: ENSG00000163814 TPM 3 1501.3368 Counts 3 1523.3582
## 4: ENSG00000211455 TPM 4 11847.3131 Counts 4 12017.3314
## 5: ENSG00000163762 TPM 5 1897.0253 Counts 5 1920.3664
## 6: ENSG00000138271 TPM 6 526.3565 Counts 6 533.8481
## logMeanExpression RankDiff MeanRank Shrinkage num.trans
## 1: 8.648484 0 1 None 1
## 2: 8.219169 0 2 None 2
## 3: 7.724454 0 3 None 3
## 4: 9.790094 0 4 None 13
## 5: 7.957600 0 5 None 5
## 6: 6.676177 0 6 None 2
head(rankList[[2]])
## Gene Input.x Rank.x baseMean.x Input.y Rank.y baseMean.y
## 1: ENSG00000244398 TPM 1 15.032297 Counts 1 15.382334
## 2: ENSG00000248167 TPM 2 14.754788 Counts 3 14.908009
## 3: ENSG00000198796 TPM 3 46.260638 Counts 2 44.976447
## 4: ENSG00000178776 TPM 4 7.011544 Counts 4 7.107135
## 5: ENSG00000263647 TPM 5 6.032185 Counts 5 6.125926
## 6: ENSG00000205277 TPM 6 31.318717 Counts 6 19.783206
## logMeanExpression RankDiff MeanRank Shrinkage num.trans
## 1: 3.123398 0 1.0 None 1
## 2: 3.100488 -1 2.5 None 70
## 3: 4.230460 1 2.5 None 6
## 4: 2.357557 0 4.0 None 3
## 5: 2.207741 0 5.0 None 1
## 6: 3.718689 0 6.0 None 6
head(rankList[[3]])
## Gene Input.x Rank.x baseMean.x Input.y Rank.y baseMean.y
## 1: ENSG00000263647 TPM 1 6.032185 Counts 1 6.125926
## 2: ENSG00000271723 TPM 2 19.767862 Counts 2 20.267804
## 3: ENSG00000286053 TPM 3 5.058302 <NA> NA NA
## 4: ENSG00000261408 TPM 4 5.481395 <NA> NA NA
## 5: ENSG00000251177 TPM 5 8.960407 Counts 3 9.087309
## 6: ENSG00000205111 TPM 6 8.949362 Counts 5 8.562319
## logMeanExpression RankDiff MeanRank Shrinkage num.trans
## 1: 2.207741 0 1.0 None 1
## 2: 3.397917 0 2.0 None 4
## 3: NA NA NA None 4
## 4: NA NA NA None 3
## 5: 2.602991 2 4.0 None 1
## 6: 2.582526 1 5.5 None 4
head(rankList[[4]])
## Gene Input.x Rank.x baseMean.x Input.y Rank.y baseMean.y
## 1: ENSG00000244398 TPM 1 15.032297 Counts 1 15.382334
## 2: ENSG00000248167 TPM 2 14.754788 Counts 3 14.908009
## 3: ENSG00000198796 TPM 3 46.260638 Counts 2 44.976447
## 4: ENSG00000178776 TPM 4 7.011544 Counts 4 7.107135
## 5: ENSG00000205277 TPM 5 31.318717 Counts 5 19.783206
## 6: ENSG00000137673 TPM 6 134.059247 Counts 6 136.045521
## logMeanExpression RankDiff MeanRank Shrinkage num.trans
## 1: 3.123398 0 1.0 None 1
## 2: 3.100488 -1 2.5 None 70
## 3: 4.230460 1 2.5 None 6
## 4: 2.357557 0 4.0 None 3
## 5: 3.718689 0 5.0 None 6
## 6: 5.308674 0 6.0 None 3
# Set a function plotting rank difference vs number of transcripts
rank.ntrans.plot.fn <- function(df, title) {
ggplot(df, aes(x=num.trans, y=abs(RankDiff), color=MeanRank)) +
geom_jitter(alpha=0.5) +
theme_bw() +
ggtitle(paste("Rank Difference vs Number of Alternative Transcripts \nin", title)) +
xlab("Number of Alternative Transcripts") +
ylab("Absolute Rank Difference \n(TPM - Counts)") + scale_color_gradient(low="blue", high="red")
}
# Print the plots
rank.ntrans.plot.fn(rankList[[1]], "FDR")
rank.ntrans.plot.fn(rankList[[2]], "log2FoldChange")
rank.ntrans.plot.fn(rankList[[3]], "log2FoldChange (Increase)")
rank.ntrans.plot.fn(rankList[[4]], "log2FoldChange (Decrease)")
# Initialize a list storing rankdiff genes
large.rankdiff <- rankList
# Assign a vector storing minimum (thresholds) rankdiff for filtering large rankdiff genes
rankdiff.thr <- c(10, 10, 10, 10)
names(rankdiff.thr) <- rankdiff.levels
for (x in rankdiff.levels) {
# Filter out observations below the rankdiff thresholds
large.rankdiff[[x]] <- subset(rankList[[x]],
abs(RankDiff) > rankdiff.thr[x])
}
# Explore the filtered genes
summary(large.rankdiff)
## Length Class Mode
## FDR 12 data.table list
## log2FoldChange 12 data.table list
## log2FoldChange.Increase 12 data.table list
## log2FoldChange.Decrease 12 data.table list
dim(large.rankdiff[[rankdiff.levels[1]]])
## [1] 65 12
dim(large.rankdiff[[rankdiff.levels[2]]])
## [1] 5 12
dim(large.rankdiff[[rankdiff.levels[3]]])
## [1] 7 12
dim(large.rankdiff[[rankdiff.levels[4]]])
## [1] 4 12
# Set a function cleaning data to generate upset plots
upset.input.fn <- function(df) {
df <- df %>%
# Filter genes with valid padj
filter(!is.na(padj)) %>%
mutate(Up=ifelse(FDR == FDRv[1] & log2FoldChange > 0, Gene, ""), # What are upregulated genes?
Down=ifelse(FDR == FDRv[1] & log2FoldChange < 0, Gene, ""), # What are downregulated genes?
Unchanged=ifelse(FDR == FDRv[2], Gene, ""), # What are unchanged genes?
TPM_Input=ifelse(Input == "TPM", Gene, ""), # What are the genes from TPM input?
Counts_Input=ifelse(Input == "Counts", Gene, "")) # What are the genes from Counts input?
# Create a list storing groups of interest
upsetInput <- list(Up=df$Up,
Down=df$Down,
Unchanged=df$Unchanged,
TPM_Input=df$TPM,
Counts_Input=df$Counts)
return(upsetInput)
}
upsetList <- upset.input.fn(both.df)
# Create the upset plot
upset(fromList(upsetList),
sets=names(upsetList), # What group to display
sets.x.label="Number of Genes per Group",
order.by="freq",
point.size=3,
sets.bar.color=c("red", "red","blue", "blue", "blue"),
text.scale = 1.5, number.angles=30)
sessionInfo()
## R version 4.0.3 (2020-10-10)
## Platform: x86_64-conda-linux-gnu (64-bit)
## Running under: Ubuntu 20.04.1 LTS
##
## Matrix products: default
## BLAS/LAPACK: /home/mira/miniconda3/envs/r/lib/libopenblasp-r0.3.10.so
##
## locale:
## [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
## [3] LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8
## [5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8
## [7] LC_PAPER=en_US.UTF-8 LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
##
## attached base packages:
## [1] stats4 parallel stats graphics grDevices utils datasets
## [8] methods base
##
## other attached packages:
## [1] AnnotationDbi_1.52.0 ashr_2.2-47
## [3] apeglm_1.12.0 UpSetR_1.4.0
## [5] gridExtra_2.3 pheatmap_1.0.12
## [7] DESeq2_1.30.0 SummarizedExperiment_1.20.0
## [9] Biobase_2.50.0 MatrixGenerics_1.2.0
## [11] matrixStats_0.57.0 GenomicRanges_1.42.0
## [13] GenomeInfoDb_1.26.2 IRanges_2.24.0
## [15] S4Vectors_0.28.1 tximport_1.18.0
## [17] forcats_0.5.0 stringr_1.4.0
## [19] dplyr_1.0.2 purrr_0.3.4
## [21] readr_1.4.0 tidyr_1.1.2
## [23] tibble_3.0.4 ggplot2_3.3.2
## [25] tidyverse_1.3.0 AnnotationHub_2.22.0
## [27] BiocFileCache_1.14.0 dbplyr_2.0.0
## [29] BiocGenerics_0.36.0 rmarkdown_2.5
## [31] data.table_1.13.4
##
## loaded via a namespace (and not attached):
## [1] colorspace_2.0-0 ellipsis_0.3.1
## [3] XVector_0.30.0 fs_1.5.0
## [5] rstudioapi_0.13 farver_2.0.3
## [7] bit64_4.0.5 mvtnorm_1.1-1
## [9] interactiveDisplayBase_1.28.0 fansi_0.4.1
## [11] lubridate_1.7.9.2 xml2_1.3.2
## [13] splines_4.0.3 geneplotter_1.68.0
## [15] knitr_1.30 jsonlite_1.7.2
## [17] broom_0.7.2 annotate_1.68.0
## [19] shiny_1.5.0 BiocManager_1.30.10
## [21] compiler_4.0.3 httr_1.4.2
## [23] backports_1.2.1 assertthat_0.2.1
## [25] Matrix_1.2-18 fastmap_1.0.1
## [27] cli_2.2.0 later_1.1.0.1
## [29] htmltools_0.5.0 tools_4.0.3
## [31] coda_0.19-4 gtable_0.3.0
## [33] glue_1.4.2 GenomeInfoDbData_1.2.4
## [35] rappdirs_0.3.1 Rcpp_1.0.5
## [37] bbmle_1.0.23.1 cellranger_1.1.0
## [39] vctrs_0.3.5 xfun_0.19
## [41] ps_1.5.0 rvest_0.3.6
## [43] irlba_2.3.3 mime_0.9
## [45] lifecycle_0.2.0 XML_3.99-0.5
## [47] MASS_7.3-53 zlibbioc_1.36.0
## [49] scales_1.1.1 hms_0.5.3
## [51] promises_1.1.1 RColorBrewer_1.1-2
## [53] yaml_2.2.1 curl_4.3
## [55] memoise_1.1.0 emdbook_1.3.12
## [57] bdsmatrix_1.3-4 SQUAREM_2020.5
## [59] stringi_1.5.3 RSQLite_2.2.1
## [61] BiocVersion_3.12.0 genefilter_1.72.0
## [63] BiocParallel_1.24.1 truncnorm_1.0-8
## [65] rlang_0.4.9 pkgconfig_2.0.3
## [67] bitops_1.0-6 invgamma_1.1
## [69] evaluate_0.14 lattice_0.20-41
## [71] labeling_0.4.2 bit_4.0.4
## [73] tidyselect_1.1.0 plyr_1.8.6
## [75] magrittr_2.0.1 R6_2.5.0
## [77] generics_0.1.0 DelayedArray_0.16.0
## [79] DBI_1.1.0 pillar_1.4.7
## [81] haven_2.3.1 withr_2.3.0
## [83] mixsqp_0.3-43 survival_3.2-7
## [85] RCurl_1.98-1.2 modelr_0.1.8
## [87] crayon_1.3.4 locfit_1.5-9.4
## [89] grid_4.0.3 readxl_1.3.1
## [91] blob_1.2.1 reprex_0.3.0
## [93] digest_0.6.27 xtable_1.8-4
## [95] numDeriv_2016.8-1.1 httpuv_1.5.4
## [97] munsell_0.5.0